###########################
### Country year coding ###
###########################

## gle_cgdpc: GDP per Capita current prices (Gleditsch 2002)
## gle_pop: population in 1000s (Gleditsch 2002)
## gle_gdp: GDP constant (2005) dollars (Gleditsch 2002)
## p_polity2: Polity combined score (21-point, -10 to 10)
## wdi_landara: Land area in sq. km (WDI)

## Take variables from Quality of Government data (aggregation of variables listed above) ##
# Load QOG dataset

## Note: I used  "qog_std_ts_jan15.dta" but they have since updated to "qog_std_ts_jan16.dta." There should be few
## if any differences.

# QOGT <- read.dta(file = "http://www.qogdata.pol.gu.se/data/qog_std_ts_jan16.dta")
# factors <- subset(QOGT, select = c("ccodecow", "year", "gle_cgdpc", "wdi_landara", "gle_pop",  "gle_gdp", "p_polity2",
#                                    "ucdp_type1", "ucdp_type2", "ucdp_type3", "ucdp_type4"))
# write.dta(factors, file =  paste0(getwd(), "/data/factors.dta"))


# Load data from QOG
factors <- read.dta(file = paste0(getwd(), "/data/factors.dta"))

# region info for 190 states (from Lemke)
regions <- read.dta(file = paste0(getwd(),"/data/region.dta"))

# National Material Capabilities (v4.0)
cinc <- read.csv(file = paste0(getwd(),"/data/NMC_v4_0.csv"), head = TRUE, sep = ",", na = c(-9))
cinc <- subset(cinc, select = c("ccode", "year", "cinc"))

# Keep only years since 1960
cinc <- subset(cinc, year >= 1960)

# Remove missing values
cinc <- cinc[complete.cases(cinc) == TRUE,]

# Merge cinc with region data
cinc <- merge(x = cinc, y = regions, by.x = "ccode", by.y = "ccode")

# Generate variable for highest cinc score by region-year; mark this state as hegemon
cinc <- ddply(cinc, .(lemkeregion, year), transform, maxcinc = max(cinc, na.rm = TRUE))
cinc$hegemon <- ifelse(cinc$maxcinc == cinc$cinc, 1, 0)

# Test that there is exactly one hegemon per region-year (there could be ties)
# cinc <- ddply(cinc, .(lemkeregion, year), transform, test = sum(hegemon, na.rm = TRUE))
# table(cinc$test) # Looks good
# cinc$test <- NULL

# Add in hegemon polity score
pol <- subset(factors, select = c("ccodecow", "year", "p_polity2"))

cinc <- merge(x = cinc, y = pol, by.x = c("ccode", "year"), by.y = c("ccodecow", "year"), all.x = TRUE)
cinc$hegpol <- ifelse(cinc$hegemon == 1, cinc$p_polity2, NA)

# Get hegemon polity at region-year level
cinc <- ddply(cinc, .(lemkeregion, year), transform, hegpolity = mean(hegpol, na.rm = TRUE))

# Generate variable for summed cinc at region-year level
cinc <- ddply(cinc, .(lemkeregion, year), transform, totcinc = sum(cinc, na.rm = TRUE))

# Generate country-year variable for share of region cinc
cinc$regshare = cinc$cinc / cinc$totcinc

## Generate Herfindahl-Hirschman index ## 

# Sum of squared cinc shares by region
cinc <- ddply(cinc, .(lemkeregion, year), transform, herf = sum((regshare^2), na.rm = TRUE))

# Count of states in the region
cinc <- ddply(cinc, .(lemkeregion, year), transform, count = length(cinc))

# Generate normalized Herfindahl-Hirschman index (NHHI) = (HHI - 1/N) / (1 - 1/N)
cinc$NHHI <- (cinc$herf - (1 / cinc$count)) / (1 - (1 / cinc$count)) 

# Generate simpler variable for hegemon capability share
cinc$hegregshare <- cinc$maxcinc / cinc$totcinc

# Finalize cinc data
cinc.final <- subset(cinc, select = c("ccode", "lemkeregion", "year", "cinc", "regshare", "totcinc", 
                                      "count", "NHHI", "hegregshare", "hegpolity", "hegemon"))


## COW trade 3.0 variables ##

## directed dyad trade 3.0 (Barbieri and Keshk 2012) ## 
## flow1: dyadic flow from B to A (A's imports) in millions of current dollars ##
## flow2: dyadic flow from A to B (A's exports) in millions of current dollars ##

# Read and subset COW trade data
ddtrade <- read.dta(file = paste0(getwd(), "/data/cow_trade_by_directed_dyad.dta"))
ddtrade <- subset(ddtrade, year >= 1960)

# Cut hegemon info for merging with dyadic trade data
heg <- subset(cinc.final, select = c("ccode", "year", "lemkeregion", "hegemon"))

## For each country:
## Code (1) intra-region trade, (2) inter-region trade
## (3) intra-region trade excl. hegemon, (4) inter-region trade excl. hegemon

# Merge dyad trade with cut hegemon and region data
dtrade.m <- merge(x = ddtrade, y = heg, by.x = c("ccode1", "year"), by.y = c("ccode", "year"))
dtrade.m <- rename(dtrade.m, c("lemkeregion" = "lemkeregion1", "hegemon" = "hegemon1"))

dtrade.m <- merge(x = dtrade.m, y = heg, by.x = c("ccode2", "year"), by.y = c("ccode", "year"))
dtrade.m <- rename(dtrade.m, c("lemkeregion" = "lemkeregion2", "hegemon" = "hegemon2"))

# Create total trade variable
dtrade.m$tottrade <- ifelse(is.na(dtrade.m$flow1),  dtrade.m$flow2, 
                            ifelse(is.na(dtrade.m$flow2), dtrade.m$flow1, 
                                   dtrade.m$flow1 + dtrade.m$flow2))


# Remove unneccesary variables (save space)
dtrade.m <- subset(dtrade.m, select = c("ccode1", "ccode2", "year", "lemkeregion1", "lemkeregion2", "hegemon1", "hegemon2", "tottrade"))

# Load CEPII data for distance etc.
cepii <- read.dta(file = paste0(getwd(), "/data/dist_cepii.dta"))

# Add COW country code
cepii$ccode1 <- countrycode(cepii$iso_o, "iso3c", "cown")
cepii$ccode2 <- countrycode(cepii$iso_d, "iso3c", "cown")

# Keep capital distance, contiguity, and common language
cepii <- subset(cepii, select = c("ccode1", "ccode2", "contig", "distcap", "comlang_ethno"), is.na(ccode1) == FALSE & is.na(ccode2) == FALSE & ccode1 != ccode2)

# merge dtrade.m with cepii
dtrade.m <- merge(x = dtrade.m, y = cepii, by.x = c("ccode1", "ccode2"), by.y = c("ccode1", "ccode2"))

## Conditional dyadic trade values ##
# All states
dtrade.m$dtradeintra.all <- ifelse(dtrade.m$lemkeregion1 == dtrade.m$lemkeregion2, dtrade.m$tottrade, NA)
dtrade.m$dtradeinter.all <- ifelse(dtrade.m$lemkeregion1 != dtrade.m$lemkeregion2, dtrade.m$tottrade, NA)

# No hegemon dyads
dtrade.m$dtradeintra.noheg <- ifelse(dtrade.m$lemkeregion1 == dtrade.m$lemkeregion2 & dtrade.m$hegemon2 == 0, dtrade.m$tottrade, NA)
dtrade.m$dtradeinter.noheg <- ifelse(dtrade.m$lemkeregion1 != dtrade.m$lemkeregion2 & dtrade.m$hegemon2 == 0, dtrade.m$tottrade, NA)

# Sum conditional dyadic trade values by ccode1 and year, create new data frame
cyear <- ddply(dtrade.m, .(ccode1, year), summarize, intratrade.all = sum(dtradeintra.all, na.rm = TRUE), 
               intertrade.all = sum(dtradeinter.all, na.rm = TRUE), intratrade.nh = sum(dtradeintra.noheg, na.rm = TRUE),
               intertrade.nh = sum(dtradeinter.noheg, na.rm = TRUE), avdist = mean(distcap), countcontig = sum(contig),
               countlang = sum(comlang_ethno))

## Merge everything together

# Load constant dollar info
constant <- read.dta(file = paste0(getwd(), "/data/Current_Constant_DIVIDE.dta"))

final.data <- merge(x = cyear, y = cinc.final, by.x = c("ccode1", "year"), by.y = c("ccode", "year"))
final.data <- merge(x = final.data, y = factors, by.x = c("ccode1", "year"), by.y = c("ccodecow", "year"))
final.data <- merge(x = final.data, y = constant, by.x = c("year"), by.y = c("year"))

## Create final variables
# Logged trade variable, adding one to take care of zeros (which are rarer at the country-level)
final.data$log.intratrade.all <- log((final.data$intratrade.all/final.data$constant) + 1)
final.data$log.intratrade.nh <- log((final.data$intratrade.nh/final.data$constant) + 1)
final.data$log.intertrade.all <- log((final.data$intertrade.all/final.data$constant) + 1)
final.data$log.intertrade.nh <- log((final.data$intertrade.nh/final.data$constant) + 1)

# Logged real GDP
final.data$log.gdp <- log(final.data$gle_gdp)

# Logged population variable
final.data$log.pop <- log(final.data$gle_pop)

final.data$conflict.dum <- ifelse(final.data$ucdp_type1 > 0 | final.data$ucdp_type2 > 0 | 
                                    final.data$ucdp_type3 > 0 | final.data$ucdp_type4 > 0, 1, 0)
final.data$conflict.dum[is.na(final.data$conflict.dum) == TRUE] <- 0

## Lead DV ##

# Order data
final.data <- final.data[order(final.data$ccode1, final.data$year),]

# Lead variables (log trade values)
final.data <- slide(final.data, Var = "log.intratrade.all", GroupVar = "ccode1", slideBy = 1)
final.data <- slide(final.data, Var = "log.intratrade.nh", GroupVar = "ccode1", slideBy = 1)
final.data <- slide(final.data, Var = "log.intertrade.all", GroupVar = "ccode1", slideBy = 1)
final.data <- slide(final.data, Var = "log.intertrade.nh", GroupVar = "ccode1", slideBy = 1)

## Proportion of intra-region trade ##
# Excl trade with hegemon
final.data$pertrade.nh1 <- exp(final.data$log.intratrade.nh1) / (exp(final.data$log.intratrade.nh1) + exp(final.data$log.intertrade.nh1))
# Incl. trade with hegemon
final.data$pertrade.all1 <- exp(final.data$log.intratrade.all1) / (exp(final.data$log.intratrade.all1) + exp(final.data$log.intertrade.all1))

# Factors for manual twoway fe
final.data$fyear <- as.factor(final.data$year)
final.data$fccode1 <- as.factor(final.data$ccode1)

